- Collaboration and sharing with git
- Scripting your analysis
- Keeping your code, data and documentation all together
2017-03-17
git(Slides and material can be found at http://dicook.github.io/Monash-R/.)
If R were an airplane, RStudio would be the airport, providing many, many supporting services that make it easier for you, the pilot, to take off and go to awesome places. Sure, you can fly an airplane without an airport, but having those runways and supporting infrastructure is a game-changer.
Create a project to contain all of the material covered in this set of tutorials:
Instead of read.csv("C:/Documents and Settings/Adele/Honours/data/SkyFall.csv"), simply address data as read.csv("data/SkyFall.csv")
setwd(), getwd()
Save the Rmd file into the "student_files" directory
R Markdown is an authoring format that enables easy creation of dynamic documents, presentations, and reports from R. It combines the core syntax of markdown (an easy-to-write plain text format) with embedded R code chunks that are run so their output can be included in the final document. R Markdown documents are fully reproducible (they can be automatically regenerated whenever underlying R code or data changes).
RStudio's Rmarkdown cheatsheet gives a nice, concise overview of its capabilities.
RStudio's reference guide lists its options.
index.Rmd which generates these slides in RStudiodata(economics, package = "ggplot2") # data frames are essentially a list of vectors str(economics) #> Classes 'tbl_df', 'tbl' and 'data.frame': 574 obs. of 6 variables: #> $ date : Date, format: "1967-07-01" "1967-08-01" ... #> $ pce : num 507 510 516 513 518 ... #> $ pop : int 198712 198911 199113 199311 199498 199657 199808 199920 200056 200208 ... #> $ psavert : num 12.5 12.5 11.7 12.5 12.5 12.1 11.7 12.2 11.6 12.2 ... #> $ uempmed : num 4.5 4.7 4.6 4.9 4.7 4.8 5.1 4.5 4.1 4.6 ... #> $ unemploy: int 2944 2945 2958 3143 3066 3018 2878 3001 2877 2709 ...
Read the documentation for economics. Can you think of an interesting/informative function of these variable(s)?
# Line plot of unemploy / pop against date, stored in `p` and then printed.
library(ggplot2)
p <- ggplot(data = economics, mapping = aes(x = date, y = unemploy / pop)) +
  geom_line()
p
ggplot2 builds plots using a grammarp p + geom_smooth(method = "lm", se = F) p + geom_smooth(method = "loess", se = F) p + geom_smooth(method = "gam", formula = y ~ s(x, bs = "cr"), se = F)
geom_smooth() work?m <- lm((unemploy / pop) ~ date, data = economics) str(m) #> List of 12 #> $ coefficients : Named num [1:2] 2.60e-02 4.95e-07 #> ..- attr(*, "names")= chr [1:2] "(Intercept)" "date" #> $ residuals : Named num [1:574] -0.01076 -0.01078 -0.01075 -0.00985 -0.01026 ... #> ..- attr(*, "names")= chr [1:574] "1" "2" "3" "4" ... #> $ effects : Named num [1:574] -0.71605 0.0598 -0.00965 -0.00875 -0.00917 ... #> ..- attr(*, "names")= chr [1:574] "(Intercept)" "date" "" "" ... #> $ rank : int 2 #> $ fitted.values: Named num [1:574] 0.0256 0.0256 0.0256 0.0256 0.0256 ... #> ..- attr(*, "names")= chr [1:574] "1" "2" "3" "4" ... #> $ assign : int [1:2] 0 1 #> $ qr :List of 5 #> ..$ qr : num [1:574, 1:2] -23.9583 0.0417 0.0417 0.0417 0.0417 ... #> .. ..- attr(*, "dimnames")=List of 2 #> .. .. ..$ : chr [1:574] "1" "2" "3" "4" ... #> .. .. ..$ : chr [1:2] "(Intercept)" "date" #> .. ..- attr(*, "assign")= int [1:2] 0 1 #> ..$ qraux: num [1:2] 1.04 1.07 #> ..$ pivot: int [1:2] 1 2 #> ..$ tol : num 1e-07 #> ..$ rank : int 2 #> ..- attr(*, "class")= chr "qr" #> $ df.residual : int 572 #> $ xlevels : Named list() #> $ call : language lm(formula = (unemploy/pop) ~ date, data = economics) #> $ terms :Classes 'terms', 'formula' language (unemploy/pop) ~ date #> .. ..- attr(*, "variables")= language list((unemploy/pop), date) #> .. ..- attr(*, "factors")= int [1:2, 1] 0 1 #> .. .. ..- attr(*, "dimnames")=List of 2 #> .. .. .. ..$ : chr [1:2] "(unemploy/pop)" "date" #> .. .. .. ..$ : chr "date" #> .. ..- attr(*, "term.labels")= chr "date" #> .. ..- attr(*, "order")= int 1 #> .. ..- attr(*, "intercept")= int 1 #> .. ..- attr(*, "response")= int 1 #> .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> #> .. ..- attr(*, "predvars")= language list((unemploy/pop), date) #> .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "other" #> .. .. ..- attr(*, "names")= chr [1:2] "(unemploy/pop)" "date" #> $ model :'data.frame': 574 obs. 
of 2 variables: #> ..$ (unemploy/pop): num [1:574] 0.0148 0.0148 0.0149 0.0158 0.0154 ... #> ..$ date : Date[1:574], format: "1967-07-01" ... #> ..- attr(*, "terms")=Classes 'terms', 'formula' language (unemploy/pop) ~ date #> .. .. ..- attr(*, "variables")= language list((unemploy/pop), date) #> .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1 #> .. .. .. ..- attr(*, "dimnames")=List of 2 #> .. .. .. .. ..$ : chr [1:2] "(unemploy/pop)" "date" #> .. .. .. .. ..$ : chr "date" #> .. .. ..- attr(*, "term.labels")= chr "date" #> .. .. ..- attr(*, "order")= int 1 #> .. .. ..- attr(*, "intercept")= int 1 #> .. .. ..- attr(*, "response")= int 1 #> .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> #> .. .. ..- attr(*, "predvars")= language list((unemploy/pop), date) #> .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "other" #> .. .. .. ..- attr(*, "names")= chr [1:2] "(unemploy/pop)" "date" #> - attr(*, "class")= chr "lm"
# Attach the fitted values of the linear model `m` to the data, then draw
# the observed unemploy / pop series with the fitted line (blue) on top.
economics$yhat <- fitted(m)
p <- ggplot(data = economics) +
  geom_line(mapping = aes(x = date, y = unemploy / pop)) +
  geom_line(mapping = aes(x = date, y = yhat), color = "blue")
p
# Hand the ggplot object `p` to plotly's ggplotly() converter.
library(plotly)
ggplotly(p)
Tidy data is organised so that variables and observations are clearly identified.
library(tidyr)
library(readr)
# Read tb.csv (path relative to this document) and preview its first rows.
tb <- read_csv(file = "../data/tb.csv")
head(tb, n = 6L)
#> # A tibble: 6 × 22
#> iso2 year m_04 m_514 m_014 m_1524 m_2534 m_3544 m_4554 m_5564 m_65
#> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 AD 1989 NA NA NA NA NA NA NA NA NA
#> 2 AD 1990 NA NA NA NA NA NA NA NA NA
#> 3 AD 1991 NA NA NA NA NA NA NA NA NA
#> 4 AD 1992 NA NA NA NA NA NA NA NA NA
#> 5 AD 1993 NA NA NA NA NA NA NA NA NA
#> 6 AD 1994 NA NA NA NA NA NA NA NA NA
#> # ... with 11 more variables: m_u <int>, f_04 <int>, f_514 <int>,
#> # f_014 <int>, f_1524 <int>, f_2534 <int>, f_3544 <int>, f_4554 <int>,
#> # f_5564 <int>, f_65 <int>, f_u <int>
str(tb) #> Classes 'tbl_df', 'tbl' and 'data.frame': 5769 obs. of 22 variables: #> $ iso2 : chr "AD" "AD" "AD" "AD" ... #> $ year : int 1989 1990 1991 1992 1993 1994 1996 1997 1998 1999 ... #> $ m_04 : int NA NA NA NA NA NA NA NA NA NA ... #> $ m_514 : int NA NA NA NA NA NA NA NA NA NA ... #> $ m_014 : int NA NA NA NA NA NA 0 0 0 0 ... #> $ m_1524: int NA NA NA NA NA NA 0 0 0 0 ... #> $ m_2534: int NA NA NA NA NA NA 0 1 0 0 ... #> $ m_3544: int NA NA NA NA NA NA 4 2 1 1 ... #> $ m_4554: int NA NA NA NA NA NA 1 2 0 1 ... #> $ m_5564: int NA NA NA NA NA NA 0 1 0 0 ... #> $ m_65 : int NA NA NA NA NA NA 0 6 0 0 ... #> $ m_u : int NA NA NA NA NA NA NA NA NA NA ... #> $ f_04 : int NA NA NA NA NA NA NA NA NA NA ... #> $ f_514 : int NA NA NA NA NA NA NA NA NA NA ... #> $ f_014 : int NA NA NA NA NA NA 0 0 NA 0 ... #> $ f_1524: int NA NA NA NA NA NA 1 1 NA 0 ... #> $ f_2534: int NA NA NA NA NA NA 1 2 NA 0 ... #> $ f_3544: int NA NA NA NA NA NA 0 3 NA 1 ... #> $ f_4554: int NA NA NA NA NA NA 0 0 NA 0 ... #> $ f_5564: int NA NA NA NA NA NA 1 0 NA 0 ... #> $ f_65 : int NA NA NA NA NA NA 0 1 NA 0 ... #> $ f_u : int NA NA NA NA NA NA NA NA NA NA ...
What do you think m_04, m_514, m_014, … mean?
tb %>% gather(var, count, -year, -iso2) #> # A tibble: 115,380 × 4 #> iso2 year var count #> <chr> <int> <chr> <int> #> 1 AD 1989 m_04 NA #> 2 AD 1990 m_04 NA #> 3 AD 1991 m_04 NA #> 4 AD 1992 m_04 NA #> 5 AD 1993 m_04 NA #> 6 AD 1994 m_04 NA #> 7 AD 1996 m_04 NA #> 8 AD 1997 m_04 NA #> 9 AD 1998 m_04 NA #> 10 AD 1999 m_04 NA #> # ... with 115,370 more rows
# Stack every column except year and iso2 into var/count pairs, then split
# `var` into its `gender` and `age` parts.
tb %>%
  gather(key = var, value = count, -year, -iso2) %>%
  separate(col = var, into = c("gender", "age"))
#> # A tibble: 115,380 × 5
#> iso2 year gender age count
#> * <chr> <int> <chr> <chr> <int>
#> 1 AD 1989 m 04 NA
#> 2 AD 1990 m 04 NA
#> 3 AD 1991 m 04 NA
#> 4 AD 1992 m 04 NA
#> 5 AD 1993 m 04 NA
#> 6 AD 1994 m 04 NA
#> 7 AD 1996 m 04 NA
#> 8 AD 1997 m 04 NA
#> 9 AD 1998 m 04 NA
#> 10 AD 1999 m 04 NA
#> # ... with 115,370 more rows
library(dplyr)
# Keep the long-form table (one row per iso2/year/gender/age) ...
tb_long <- tb %>%
  gather(key = var, value = count, -year, -iso2) %>%
  separate(col = var, into = c("gender", "age"))

# ... and count its rows for each year/gender combination.
tb_long %>%
  group_by(year, gender) %>%
  tally()
#> Source: local data frame [58 x 3]
#> Groups: year [?]
#>
#> year gender n
#> <int> <chr> <int>
#> 1 1980 f 1910
#> 2 1980 m 1910
#> 3 1981 f 1940
#> 4 1981 m 1940
#> 5 1982 f 1940
#> 6 1982 m 1940
#> 7 1983 f 1960
#> 8 1983 m 1960
#> 9 1984 f 1930
#> 10 1984 m 1930
#> # ... with 48 more rows
library(babynames) head(babynames) #> # A tibble: 6 × 5 #> year sex name n prop #> <dbl> <chr> <chr> <int> <dbl> #> 1 1880 F Mary 7065 0.07238359 #> 2 1880 F Anna 2604 0.02667896 #> 3 1880 F Emma 2003 0.02052149 #> 4 1880 F Elizabeth 1939 0.01986579 #> 5 1880 F Minnie 1746 0.01788843 #> 6 1880 F Margaret 1578 0.01616720 dim(babynames) #> [1] 1825433 5
bb_path <- tempfile(fileext = ".csv", tmpdir = ".") write_csv(babynames, bb_path) read_csv(bb_path) #> # A tibble: 1,825,433 × 5 #> year sex name n prop #> <int> <chr> <chr> <int> <dbl> #> 1 1880 F Mary 7065 0.07238359 #> 2 1880 F Anna 2604 0.02667896 #> 3 1880 F Emma 2003 0.02052149 #> 4 1880 F Elizabeth 1939 0.01986579 #> 5 1880 F Minnie 1746 0.01788843 #> 6 1880 F Margaret 1578 0.01616720 #> 7 1880 F Ida 1472 0.01508119 #> 8 1880 F Alice 1414 0.01448696 #> 9 1880 F Bertha 1320 0.01352390 #> 10 1880 F Sarah 1288 0.01319605 #> # ... with 1,825,423 more rows
library(readxl)
# Pick a worksheet by its name ...
read_excel(path = "my-spreadsheet.xls", sheet = "data")
# ... or by its position in the workbook.
read_excel(path = "my-spreadsheet.xls", sheet = 2)
library(haven)
# SAS files ("path/to/file" is a placeholder for the real file path)
read_sas("path/to/file")
# SPSS files: read_por() and read_sav() are two separate readers
read_por("path/to/file")
read_sav("path/to/file")
# Stata files
read_dta("path/to/file")
# Open the SQLite file (create = TRUE allows it to be created) and copy the
# babynames data in, unless a table of that name is already present.
db <- src_sqlite(path = "babynames.sqlite3", create = TRUE)
if (!db_has_table(db$con, "babynames")) {
  copy_to(db, babynames)
}
db #> src: sqlite 3.8.6 [babynames.sqlite3] #> tbls: babynames, sqlite_stat1 tbl(db, "babynames") #> Source: query [?? x 5] #> Database: sqlite 3.8.6 [babynames.sqlite3] #> #> year sex name n prop #> <dbl> <chr> <chr> <int> <dbl> #> 1 1880 F Mary 7065 0.07238359 #> 2 1880 F Anna 2604 0.02667896 #> 3 1880 F Emma 2003 0.02052149 #> 4 1880 F Elizabeth 1939 0.01986579 #> 5 1880 F Minnie 1746 0.01788843 #> 6 1880 F Margaret 1578 0.01616720 #> 7 1880 F Ida 1472 0.01508119 #> 8 1880 F Alice 1414 0.01448696 #> 9 1880 F Bertha 1320 0.01352390 #> 10 1880 F Sarah 1288 0.01319605 #> # ... with more rows
# Reference the babynames table in the database and restrict it to the rows
# whose name is "Hilary"; nothing is pulled into R until collect() is called.
h <- tbl(db, "babynames") %>%
  filter(name == "Hilary")
class(h) #> [1] "tbl_sqlite" "tbl_sql" "tbl_lazy" "tbl" h$query #> NULL # execute SQL query and bring into R hc <- collect(h) class(hc) #> [1] "tbl_df" "tbl" "data.frame" hc #> # A tibble: 190 × 5 #> year sex name n prop #> <dbl> <chr> <chr> <int> <dbl> #> 1 1882 M Hilary 7 5.736153e-05 #> 2 1883 M Hilary 6 5.334282e-05 #> 3 1887 M Hilary 7 6.403571e-05 #> 4 1891 M Hilary 8 7.321314e-05 #> 5 1896 M Hilary 6 4.648496e-05 #> 6 1897 M Hilary 5 4.100276e-05 #> 7 1898 M Hilary 5 3.784811e-05 #> 8 1902 M Hilary 8 6.026411e-05 #> 9 1904 M Hilary 5 3.609900e-05 #> 10 1905 M Hilary 6 4.188628e-05 #> # ... with 180 more rows
Data extracted from http://openexchangerates.org via its JSON API, using the R package jsonlite.
library(readr)
# Load rates.csv and show its first 5 rows and first 8 columns.
rates <- read_csv(file = "../data/rates.csv")
rates[seq_len(5), seq_len(8)]
#> # A tibble: 5 × 8
#> date AED AFN ALL AMD ANG AOA ARS
#> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2015-02-23 3.672900 57.33792 123.8969 478.692 1.78968 105.9075 8.702166
#> 2 2015-02-24 3.672069 57.35200 123.7132 478.608 1.78958 106.1014 8.696728
#> 3 2015-02-25 3.673324 57.32655 123.5259 478.616 1.78954 106.1913 8.715239
#> 4 2015-02-26 3.673028 57.52745 124.5801 480.294 1.78956 106.3042 8.720107
#> 5 2015-02-27 3.672648 57.33172 124.8491 478.812 1.78958 106.3389 8.721236
If you'd like to collect exchange rates yourself, see here.
library(ggmap)
library(ggthemes)
# Download a map for location c(145.13, -37.9150) and flatten it into a data
# frame of coordinates plus colour so it can be drawn with geom_tile().
m <- get_map(location = c(145.13, -37.9150), zoom = 14, scale = 1)
# ggmap(m)

# Bounding box stored on the map object, as a plain numeric vector.
b <- as.numeric(attr(m, "bb"))

# NOTE(review): assumes the map is a 640 x 640 grid of colours - confirm.
m_df <- data.frame(
  lon = (rep(1:640, times = 640) - 1) / 640 * (b[4] - b[2]) + b[2],
  lat = (rep(640:1, each = 640) - 1) / 640 * (b[3] - b[1]) + b[1],
  fill = as.vector(m)
)

# Draw each cell with the colour stored in `fill` (identity scale, no legend).
ggplot() +
  geom_tile(data = m_df, mapping = aes(x = lon, y = lat, fill = fill)) +
  scale_fill_identity() +
  theme_map() +
  theme(legend.position = "none", aspect.ratio = 1)
#> Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
#> instead